{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "tAB0uxAUES2f"
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer\n",
    "from sklearn import svm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 读取数据，只用到了词信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Name of the text column that is vectorized for both the training and the test set.\n",
    "column = \"word_seg\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "XDHFiQOHEjfD"
   },
   "outputs": [],
   "source": [
    "# Load only the id, text and label columns; `id` becomes the DataFrame index.\n",
    "train_df = pd.read_csv('train_set.csv.zip',usecols=['id',column,'class'], index_col='id')\n",
    "\n",
    "# Fit a throwaway binary 1-2 gram vectorizer just to collect the training vocabulary\n",
    "# (min_df=3 / max_df=0.98 drop very rare and near-ubiquitous terms).\n",
    "vec = CountVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.98, binary=True)\n",
    "vec.fit(train_df[column])\n",
    "\n",
    "# get_feature_names() was removed in scikit-learn 1.2 in favour of\n",
    "# get_feature_names_out(); support both and always end up with a plain list\n",
    "# so the vocabulary can be combined with the test vocabulary later.\n",
    "if hasattr(vec, 'get_feature_names_out'):\n",
    "    train_feather_names = list(vec.get_feature_names_out())\n",
    "else:\n",
    "    train_feather_names = vec.get_feature_names()\n",
    "\n",
    "# Only the feature names are kept; the fitted vectorizer itself is discarded.\n",
    "del vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the id and text columns of the test set; `id` becomes the DataFrame index.\n",
    "# The text column is selected through `column` (not a hard-coded name) so that it\n",
    "# always matches the column read on the lines below.\n",
    "test_df = pd.read_csv('test_set.csv.zip',usecols=['id',column], index_col='id')\n",
    "\n",
    "# Same vectorizer settings as for the training set, fitted on the test texts only,\n",
    "# to collect the test vocabulary.\n",
    "vec = CountVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.98,  binary=True)\n",
    "vec.fit(test_df[column])\n",
    "\n",
    "# get_feature_names() was removed in scikit-learn 1.2 in favour of\n",
    "# get_feature_names_out(); support both and always end up with a plain list.\n",
    "if hasattr(vec, 'get_feature_names_out'):\n",
    "    test_feather_names = list(vec.get_feature_names_out())\n",
    "else:\n",
    "    test_feather_names = vec.get_feature_names()\n",
    "\n",
    "# Only the feature names are kept; the fitted vectorizer itself is discarded.\n",
    "del vec"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 选出训练集与测试集都存在的特征，防止把分类的“注意力”放在没用的特征上"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the n-grams present in BOTH vocabularies (set intersection).\n",
    "# Concatenating the two lists before calling set() would give the union instead,\n",
    "# which also keeps train-only and test-only terms.\n",
    "# Sorting makes the vocabulary order (and thus the feature column order) reproducible\n",
    "# across runs, since set iteration order for strings is not stable.\n",
    "valide_feather_name = sorted(set(train_feather_names) & set(test_feather_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3702540"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of n-grams in the vocabulary that is later handed to the TF-IDF vectorizer.\n",
    "len(valide_feather_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "sqNw46lsIX5q"
   },
   "source": [
    "## 尝试改进TFIDF\n",
    "### 个人认为TFIDF突出的是相对于整个语料的词特异度，而不能反映类别的特征。所以想根据类别把语料聚合起来，然后再计算idf值。事实证明这样做是有效的。大概在A榜上从0.777+提升到了0.7804。但是值得注意的是在做stack的时候，这里很容易产生leak，也可能就是这个原因，我做的stack效果奇差无比。\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "aszYPv7JIMGO"
   },
   "outputs": [],
   "source": [
    "# Concatenate all training texts of each class into one space-separated string,\n",
    "# giving one row per class. The text column is taken from `column` so this cell\n",
    "# stays consistent with the columns that were actually loaded into train_df.\n",
    "join_df = train_df.groupby('class')[[column]].agg(' '.join)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "SqU6NE2UIsmo"
   },
   "outputs": [],
   "source": [
    "# TF-IDF vectorizer restricted to the fixed vocabulary built above.\n",
    "# The flags are real booleans: scikit-learn >= 1.2 validates parameter types and\n",
    "# rejects the integer 1 for boolean options, while True behaves the same on older\n",
    "# releases.\n",
    "vec = TfidfVectorizer(ngram_range=(1, 2), use_idf=True, smooth_idf=True,\n",
    "                      sublinear_tf=True, vocabulary=valide_feather_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     },
     "base_uri": "https://localhost:8080/",
     "height": 156
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 162744,
     "status": "ok",
     "timestamp": 1531713454472,
     "user": {
      "displayName": "Andrew Wang",
      "photoUrl": "//lh4.googleusercontent.com/-VE7PVDZpiAs/AAAAAAAAAAI/AAAAAAAAAQ4/Rlh3zta3Azw/s50-c-k-no/photo.jpg",
      "userId": "101015419335630229683"
     },
     "user_tz": -480
    },
    "id": "0Kv3tIgTJFTj",
    "outputId": "b2fe22ef-6bf8-40bb-bb5d-c5a30467f962"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
       "        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
       "        lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
       "        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=1,\n",
       "        stop_words=None, strip_accents=None, sublinear_tf=1,\n",
       "        token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', tokenizer=None, use_idf=1,\n",
       "        vocabulary=['816903 502544', '617854 472396', '447986 572782', '1195450 851011', '1226448 126663', '701424 657515', '566705 960615', '591310 886801', '933052 937565', '192766 378766', '905676 35621', '520477 1041972', '530030 328314', '816903 824325', '1171925 916922', '1163183 1259103', '100833 895...8682 342847', '353916 567228', '907326 859563', '912149 716116', '1090389 1164235', '520477 572373'])"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Learn the IDF weights from the per-class documents in join_df rather than from the\n",
    "# individual training rows. The text column is addressed through `column`, matching\n",
    "# the transform calls on train_df / test_df.\n",
    "vec.fit(join_df[column])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "I8ZsU1UOLraA",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Target labels, taken straight from the training frame.\n",
    "train_Y = train_df['class']\n",
    "\n",
    "# TF-IDF features for the individual training documents, using the fitted vectorizer.\n",
    "train_X = vec.transform(train_df[column])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TF-IDF features for the test documents, produced by the same fitted vectorizer\n",
    "# that transformed the training documents.\n",
    "test_X = vec.transform(test_df[column])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 因为测试发现模型在训练集上严重过拟合，所以增大误差容许范围（tol），尝试降低过拟合，这样使成绩从0.7799提升到0.7804"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import svm\n",
    "\n",
    "# Linear SVM with a deliberately loose stopping tolerance (tol=1) and class weights\n",
    "# balanced by class frequency.\n",
    "# random_state pins the solver's internal data shuffling so that repeated runs\n",
    "# produce the same model and the same predictions.\n",
    "clf = svm.LinearSVC(tol=1, class_weight='balanced', random_state=0)\n",
    "clf.fit(train_X,train_Y)\n",
    "\n",
    "# Predicted class label for every test document.\n",
    "pred = clf.predict(test_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "WKwAdV7pbsvV"
   },
   "outputs": [],
   "source": [
    "# Empty submission frame that reuses the test set's index (the `id` column).\n",
    "out_df = pd.DataFrame(index=test_df.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "PprmfA7Pb1mk"
   },
   "outputs": [],
   "source": [
    "# Attach the predicted labels; pred was computed from test_X, so it is in test_df row order.\n",
    "out_df['class']=pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "GgSrkEPCb4CL"
   },
   "outputs": [],
   "source": [
    "# Write the submission file: the index is written as the first column, followed by\n",
    "# `class`, with a header row.\n",
    "out_df.to_csv('submit.csv',index=True, header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "default_view": {},
   "name": "Improve TFIDF2.ipynb",
   "provenance": [],
   "version": "0.3.2",
   "views": {}
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
